# -*- coding: utf-8 -*-
"""
This script generates the seed for the waste LDA. Run it only once: if the
saved seed is ever replaced, you will not be able to replicate your results.

@author: samir
"""

import pandas as pd
import topicmodels
import topicmodels.preprocess
import os
import numpy as np
from paths import target_path, waste_LDA_path #importing target_path from paths.py


def main():
    """Print a banner announcing that the Waste LDA seed script is running.

    This function only prints; the seed-generation pipeline itself lives in
    the module-level statements of this file.
    """
    banner = "Running: Waste LDA Seeds"
    print(banner)

if __name__ == "__main__":
    main()

# NOTE: the pipeline below sits at module level (outside main()), so it runs
# whenever this file is executed *or* imported. That is kept as-is on purpose
# so existing ways of invoking the script keep working.

# Location of the manuscript seed, relative to waste_LDA_path.
SEED_FILE = '6000_tfidf_10_Topics_seed_3/seed.csv'
# Index into the tf-idf ranking whose score is used as the removal cutoff.
TFIDF_CUTOFF_RANK = 6000
# Number of topics handed to the LDA sampler.
N_TOPICS = 10

# Change to relevant directory (best effort: only warn when it is missing, so
# the script can still be launched from a directory that already holds the data)
if os.path.exists(target_path):
    os.chdir(target_path)
else:
    print(f"Warning: {target_path} does not exist!")

# Refuse to clobber an existing seed. The check happens before any expensive
# work. The path is built from the current directory so that a relative
# waste_LDA_path resolves exactly as the later os.chdir(waste_LDA_path) will;
# an absolute waste_LDA_path simply overrides the os.getcwd() prefix.
existing_seed = os.path.join(os.getcwd(), waste_LDA_path, SEED_FILE)
if os.path.exists(existing_seed):
    raise FileExistsError(
        f"{existing_seed} already exists. Refusing to overwrite it: replacing "
        "the saved seed makes earlier results impossible to replicate. "
        "Move or delete the file first if you really want a new seed."
    )

#Import data
dataframe = pd.read_csv("Waste_Subset/Processed_Data/Waste_Complaints_lemmatized.csv", encoding="utf-8")
# Missing descriptions become empty strings before the text is handed on.
dataframe = dataframe.replace(np.nan, '', regex=True)
docsobj = topicmodels.RawDocs(dataframe.IncidentDescriptionLemma, "long")

#Preprocess lemmatized complaints
# presumably drops tokens of length <= 1 -- TODO confirm against topicmodels
docsobj.token_clean(1)
docsobj.stopword_remove("tokens")

###Change directory to save LDA Outputs in correct folder
os.chdir(waste_LDA_path)
# np.savetxt does not create missing directories, so make sure the seed
# folder exists before the remaining work is done.
os.makedirs(os.path.dirname(SEED_FILE), exist_ok=True)
# NOTE(review): term_rank presumably writes its ranking files into the current
# directory, which would be why the chdir above comes first -- confirm.
docsobj.term_rank("tokens")

#Perform tf-idf removal
# The cutoff is element [1] of the ranking entry at index TFIDF_CUTOFF_RANK
# (presumably its tf-idf score). This raises IndexError if the ranking has
# that many entries or fewer.
docsobj.rank_remove("tfidf", "tokens", docsobj.tfidf_ranking[TFIDF_CUTOFF_RANK][1])

###Generate seed used in manuscript
#Seed 3
ldaobj3 = topicmodels.LDA.LDAGibbs(docsobj.tokens, N_TOPICS)
seed3 = ldaobj3.topic_seed
np.savetxt(SEED_FILE, seed3)